library(pROC)
library(ggpubr)
library(Biostrings)
library(data.table)
library(dplyr)
library(caret)
library(purrr)
library(tidyverse)
library(yardstick)
library(doParallel)
library(foreach)
library(stringdist)
library(magrittr)
# Compute the F-beta score from precision and recall.
#
# beta < 1 weights precision more heavily, beta > 1 weights recall more
# heavily; beta = 1 is the harmonic mean (the classic F1 score).
#
# Args:
#   beta:      non-negative weight on recall relative to precision.
#   precision: numeric in [0, 1] (vectorized).
#   recall:    numeric in [0, 1] (vectorized).
# Returns: numeric vector of F-beta scores; NaN when precision and recall
#   are both zero (0/0), matching the original behavior.
calculate_f_beta <- function(beta, precision, recall) {
  (beta^2 + 1) * (precision * recall / ((beta^2) * precision + recall))
}
# Load the per-model prediction results (one row per peptide per model) and
# the underlying peptide dataset. Paths are relative to the working
# directory -- assumes the .rds files sit alongside this script.
combinedData = readRDS("PanHLA_combinedData.rds") %>% as.data.table
# Interactive HTML preview of the combined results (knitr output follows).
combinedData %>% DT::datatable(caption="Model Results")
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
FullDataset = readRDS("PanHLA_FullDataset.rds")
# Optional flat-file exports, kept for reference:
# write.table(combinedData,"PanHLA_combinedData.txt",sep="\t",row.names = F,quote=F)
# write.table(FullDataset,"PanHLA_FullDataset.txt",sep="\t",row.names = F,quote=F)
# What is the accuracy of each model?
# NOTE(review): the denominator is nrow(FullDataset), not n() -- this equals
# per-model accuracy only if every Dataset group contains exactly one
# prediction per row of FullDataset; confirm against the upstream pipeline.
accuracyDF=combinedData %>% dplyr::group_by(Dataset) %>% summarise(Accuracy=sum(ImmunogenicityPrediction==Immunogenicity)/nrow(FullDataset))
# What is the ROC-AUC of each model?
# pROC::roc() runs once per Dataset group; it echoes one "Setting levels" /
# "Setting direction" message pair per group (five models -> five pairs below).
AUCDF = combinedData %>% group_by(Dataset) %>% dplyr::summarise(ROC=as.numeric(roc(Immunogenicity ~ ImmunogenicityScore)$auc))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
# Fit one pROC::roc object per model so the individual curves can be overlaid
# with ggroc() below. Levels and direction are auto-detected by pROC (see the
# echoed "Setting ..." messages after each call).
NETTEPIAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'NETTEPI'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
IPREDAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IPRED'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
IEDBMODELAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IEDB'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
REPITOPE_AUC_CV=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'REPITOPE'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
NetTepi_NETMHC.AUC.CV = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'NETTEPI_netMHCpan4'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
# Create ggroc object
# Overlay the five ROC curves on legacy axes (FPR on x, TPR on y) and annotate
# the panel with each model's AUC rounded to 3 d.p. The dashed diagonal marks
# chance-level performance; ggpubr::font() calls size the axis/legend text
# (legend title is hidden by coloring it white).
roc.AUC=ggroc(list(IEDB_Model=IEDBMODELAUC,iPred=IPREDAUC,NetTepi=NETTEPIAUC,NetTepi_NetMHCpan4=NetTepi_NETMHC.AUC.CV,REpitope=REPITOPE_AUC_CV),legacy.axes = TRUE,size=1.25) + theme_bw() +
annotate("size"=5,"text",x=.55,y=.185,label=paste0("IEDB: ",round(auc(IEDBMODELAUC),digits=3),"\n","iPred: ",round(auc(IPREDAUC),digits=3),"\n","NetTepi: ",round(auc(NETTEPIAUC),digits=3),"\n","NetTepi_NetMHCpan4: ",round(auc(NetTepi_NETMHC.AUC.CV),digits=3),"\n","Repitope: ",round(auc(REPITOPE_AUC_CV),digits=3))) + font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=12) + geom_abline(size=1,intercept = 0, slope = 1,color = "darkgrey", linetype = "dashed")
# What is each models PR-AUC?
# yardstick::pr_auc / pr_curve treat the FIRST factor level as the event
# class, so Immunogenicity is releveled with "Positive" first.
combinedData$Immunogenicity = factor(combinedData$Immunogenicity,levels = c("Positive","Negative"))
PR_AUC_COMBINED=combinedData %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
PR_AUC_COMBINED$.estimate=round(PR_AUC_COMBINED$.estimate,digits=3)
# Faceted PR curves with a dashed baseline at the positive-class prevalence of
# FullDataset. NOTE(review): the annotation hard-codes .estimate[1..5] and
# assumes group_by sorts datasets alphabetically (IEDB, IPRED, NETTEPI,
# NETTEPI_netMHCpan4, REPITOPE) -- verify the label-to-index mapping if
# dataset names ever change.
pr.AUC=combinedData %>% group_by(Dataset) %>% pr_curve(Immunogenicity,ImmunogenicityScore) %>% autoplot() + aes(size = Dataset)+scale_size_manual(values=c(1.25,1.25,1.25,1.25,1.25)) + ggtitle("Precision-Recall Curves") +annotate("size"=5,"text",x=.5,y=.155,label=paste0("IEDB: ",PR_AUC_COMBINED$.estimate[1],"\n","iPred: ",PR_AUC_COMBINED$.estimate[2],"\n","NetTepi: ",PR_AUC_COMBINED$.estimate[3],"\n","NetTepi_NetMHCpan4: ",PR_AUC_COMBINED$.estimate[4],"\n","Repitope: ",PR_AUC_COMBINED$.estimate[5])) + geom_hline(size=1,color="darkgrey",yintercept = nrow(FullDataset[FullDataset$Immunogenicity=='Positive',]) / nrow(FullDataset),linetype="dashed")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=10)
# Convert both plots to grobs and force identical panel widths/heights so the
# PR and ROC figures align exactly when drawn on separate pages.
library(grid)
gl <- lapply(list(pr.AUC, roc.AUC), ggplotGrob)
shared_widths <- Reduce(unit.pmax, lapply(gl, function(g) g$widths))
shared_heights <- Reduce(unit.pmax, lapply(gl, function(g) g$heights))
lg <- lapply(gl, function(g) {
  g$widths <- shared_widths
  g$heights <- shared_heights
  g
})
# Draw each aligned grob on its own fresh page.
for (g in lg) {
  grid.newpage()
  grid.draw(g)
}
# Ensure truth and prediction share the same factor levels before passing
# them to caret::confusionMatrix; the positive class is set explicitly below.
combinedData$Immunogenicity = factor(combinedData$Immunogenicity,levels=c("Positive","Negative"))
combinedData$ImmunogenicityPrediction = factor(combinedData$ImmunogenicityPrediction,levels=c("Positive","Negative"))
# Per-model F1, precision, recall and balanced accuracy, each extracted from
# a caret confusion matrix computed inside a grouped summarise.
F1SCORE=combinedData %>% group_by(Dataset) %>% dplyr::summarise(F1_SCORE= confusionMatrix(positive = "Positive",mode = "prec_recall",
reference=Immunogenicity,ImmunogenicityPrediction)$byClass["F1"])
PRECISIONCV=combinedData %>% group_by(Dataset) %>% dplyr::summarise(Precision= confusionMatrix(positive = "Positive",mode = "prec_recall",
reference=Immunogenicity,ImmunogenicityPrediction)$byClass["Precision"])
RECALLCV=combinedData %>% group_by(Dataset) %>% dplyr::summarise(Recall= confusionMatrix(positive = "Positive",mode = "prec_recall",
reference=Immunogenicity,ImmunogenicityPrediction)$byClass["Recall"]) %>% as.data.table
BALACCURACYCV=combinedData %>% group_by(Dataset) %>% dplyr::summarise(BalancedAccuracy= confusionMatrix(positive = "Positive",mode = "prec_recall",
reference=Immunogenicity,ImmunogenicityPrediction)$byClass["Balanced Accuracy"])
# F0.5 (precision-weighted F score) derived from the precision/recall tables
# via calculate_f_beta() defined at the top of the file.
F0.5SCORE=inner_join(RECALLCV,PRECISIONCV) %>% group_by(Dataset) %>% dplyr::summarise(F_0.5 = calculate_f_beta(0.5,Precision,Recall))
## Joining, by = "Dataset"
# Combine the per-model metric tables into one wide table, reshape to long
# format, and draw a faceted bar chart (one panel per metric, bars labelled
# to 2 d.p., y fixed to [0, 1]).
# pivot_longer() replaces the previous reshape2::melt() redirection, whose
# own warning said the redirection "will become an error" in a future
# data.table release. fct_inorder() preserves melt's facet ordering (column
# order) instead of pivot_longer's default alphabetical ordering, and
# explicit `by = "Dataset"` keys silence the join messages.
accuracyDF %>%
  inner_join(F1SCORE, by = "Dataset") %>%
  inner_join(PRECISIONCV, by = "Dataset") %>%
  inner_join(RECALLCV, by = "Dataset") %>%
  inner_join(F0.5SCORE, by = "Dataset") %>%
  inner_join(BALACCURACYCV, by = "Dataset") %>%
  pivot_longer(-Dataset, names_to = "Metric", values_to = "value") %>%
  mutate(Metric = fct_inorder(Metric)) %>%
  ggbarplot(
    x = "Dataset", y = "value", fill = "Metric",
    label = TRUE, lab.nb.digits = 2, alpha = 0.6
  ) + facet_wrap(~Metric) + rotate_x_text(angle = 90) + ylim(0, 1)
# Confusion matrix for the IEDB model ("Positive" is the positive class).
# NOTE(review): this CM -> table -> plotTable -> ggplot sequence is repeated
# verbatim for all five models below; extracting a helper function would
# remove the duplication. Also note `table` shadows base::table() from here on.
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(combinedData %>% filter(Dataset %in% 'IEDB') %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(combinedData %>% filter(Dataset %in% 'IEDB') %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 2670 882
## Positive 2172 1797
##
## Accuracy : 0.5939
## 95% CI : (0.5827, 0.6051)
## No Information Rate : 0.6438
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2006
##
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.4528
## Recall : 0.6708
## F1 : 0.5406
## Prevalence : 0.3562
## Detection Rate : 0.2389
## Detection Prevalence : 0.5277
## Balanced Accuracy : 0.6111
##
## 'Positive' Class : Positive
##
# Convert counts to per-reference-class proportions so that tile alpha
# encodes sensitivity/specificity rather than raw counts.
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
# Heatmap of the confusion matrix: green = correct cells, red = incorrect;
# the x axis is reversed so "Positive" reference appears first.
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference))) + ggtitle("IEDB Immunogenicity Model")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.text",size=10)
# Figure 3.1: IEDB Confusion Matrix. Color intensity reflects the proportion of outcomes within each reference group (i.e. sensitivity/specificity).
# Confusion matrix and heatmap for the NetTepi model; same CM -> table ->
# plotTable -> ggplot sequence as the IEDB block above.
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(combinedData %>% filter(Dataset %in% 'NETTEPI') %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(combinedData %>% filter(Dataset %in% 'NETTEPI') %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 3173 1053
## Positive 1669 1626
##
## Accuracy : 0.6381
## 95% CI : (0.6271, 0.649)
## No Information Rate : 0.6438
## P-Value [Acc > NIR] : 0.8525
##
## Kappa : 0.2494
##
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.4935
## Recall : 0.6069
## F1 : 0.5444
## Prevalence : 0.3562
## Detection Rate : 0.2162
## Detection Prevalence : 0.4381
## Balanced Accuracy : 0.6311
##
## 'Positive' Class : Positive
##
# Per-reference-class proportions drive the tile alpha below.
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference))) + ggtitle("NetTepi")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.text",size=10)
# Figure 3.2: NetTepi Confusion Matrix. Color intensity reflects the proportion of outcomes within each reference group (i.e. sensitivity/specificity).
# Confusion matrix and heatmap for the NetTepi+NetMHCpan4 combination; same
# CM -> table -> plotTable -> ggplot sequence as the IEDB block above.
# (Note: filter uses == here, whereas the first two blocks used %in%;
# equivalent for a single value.)
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(combinedData %>% filter(Dataset == 'NETTEPI_netMHCpan4') %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(combinedData %>% filter(Dataset == 'NETTEPI_netMHCpan4') %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 3085 996
## Positive 1757 1683
##
## Accuracy : 0.634
## 95% CI : (0.623, 0.6449)
## No Information Rate : 0.6438
## P-Value [Acc > NIR] : 0.9634
##
## Kappa : 0.2495
##
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.4892
## Recall : 0.6282
## F1 : 0.5501
## Prevalence : 0.3562
## Detection Rate : 0.2238
## Detection Prevalence : 0.4574
## Balanced Accuracy : 0.6327
##
## 'Positive' Class : Positive
##
# Per-reference-class proportions drive the tile alpha below.
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference)))+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.text",size=10) + ggtitle("NETTEPI_netMHCpan4")
# Figure 3.3: NetTepi+NetMHCpan4 Confusion Matrix. Color intensity reflects the proportion of outcomes within each reference group (i.e. sensitivity/specificity).
# Confusion matrix and heatmap for the iPred model; same CM -> table ->
# plotTable -> ggplot sequence as the IEDB block above.
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(combinedData %>% filter(Dataset == 'IPRED') %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(combinedData %>% filter(Dataset == 'IPRED') %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 2885 731
## Positive 1957 1948
##
## Accuracy : 0.6426
## 95% CI : (0.6316, 0.6534)
## No Information Rate : 0.6438
## P-Value [Acc > NIR] : 0.5909
##
## Kappa : 0.293
##
## Mcnemar's Test P-Value : <2e-16
##
## Precision : 0.4988
## Recall : 0.7271
## F1 : 0.5917
## Prevalence : 0.3562
## Detection Rate : 0.2590
## Detection Prevalence : 0.5192
## Balanced Accuracy : 0.6615
##
## 'Positive' Class : Positive
##
# Per-reference-class proportions drive the tile alpha below.
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference))) + ggtitle("iPred")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.text",size=10)
# Figure 3.4: iPred Confusion Matrix. Color intensity reflects the proportion of outcomes within each reference group (i.e. sensitivity/specificity).
# Confusion matrix and heatmap for the REpitope model; same CM -> table ->
# plotTable -> ggplot sequence as the IEDB block above.
CM=confusionMatrix(positive = "Positive",mode = "prec_recall",reference=factor(combinedData %>% filter(Dataset == 'REPITOPE') %>% select(Immunogenicity) %>% pull(),
levels = c("Negative","Positive")),
factor(combinedData %>% filter(Dataset == 'REPITOPE') %>% select(ImmunogenicityPrediction) %>% pull(),
levels=c("Negative","Positive")))
CM
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 3615 977
## Positive 1227 1702
##
## Accuracy : 0.707
## 95% CI : (0.6965, 0.7172)
## No Information Rate : 0.6438
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3741
##
## Mcnemar's Test P-Value : 1.134e-07
##
## Precision : 0.5811
## Recall : 0.6353
## F1 : 0.6070
## Prevalence : 0.3562
## Detection Rate : 0.2263
## Detection Prevalence : 0.3894
## Balanced Accuracy : 0.6910
##
## 'Positive' Class : Positive
##
# Per-reference-class proportions drive the tile alpha below.
table=data.frame(CM$table)
plotTable <- table %>%
mutate(Performance = ifelse(table$Prediction == table$Reference, "Accurate", "Inaccurate")) %>%
group_by(Reference) %>%
mutate(prop = Freq/sum(Freq))
ggplot(data = plotTable, mapping = aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1,size=8) +
scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
theme_bw() +
xlim(rev(levels(table$Reference))) + ggtitle("REpitope")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.text",size=10)
# Figure 3.5: REpitope Confusion Matrix. Color intensity reflects the proportion of outcomes within each reference group (i.e. sensitivity/specificity).